Setup

## [1] "**** Utilized Cores **** = 12"

Load Libraries & Report Versions

## Error in library(kableExtra): there is no package called 'kableExtra'
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS release 6.9 (Final)
## 
## Matrix products: default
## BLAS/LAPACK: /hpc/packages/minerva-common/intel/parallel_studio_xe_2018/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
## 
## locale:
## [1] C
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] biomaRt_2.36.1 knitr_1.20     bindrcpp_0.2.2 gridExtra_2.3 
## [5] dplyr_0.7.6    Seurat_2.3.4   Matrix_1.2-14  cowplot_0.9.3 
## [9] ggplot2_3.0.0 
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.13           colorspace_1.3-2     class_7.3-14        
##   [4] modeltools_0.2-22    ggridges_0.5.0       mclust_5.4.1        
##   [7] rprojroot_1.3-2      htmlTable_1.12       base64enc_0.1-3     
##  [10] rstudioapi_0.7       proxy_0.4-22         flexmix_2.3-14      
##  [13] bit64_0.9-7          AnnotationDbi_1.42.1 mvtnorm_1.0-8       
##  [16] codetools_0.2-15     splines_3.5.1        R.methodsS3_1.7.1   
##  [19] robustbase_0.93-1.1  Formula_1.2-3        jsonlite_1.5        
##  [22] ica_1.0-2            cluster_2.0.7-1      kernlab_0.9-26      
##  [25] png_0.1-7            R.oo_1.22.0          compiler_3.5.1      
##  [28] httr_1.3.1           backports_1.1.2      assertthat_0.2.0    
##  [31] lazyeval_0.2.1       prettyunits_1.0.2    lars_1.2            
##  [34] acepack_1.4.1        htmltools_0.3.6      tools_3.5.1         
##  [37] igraph_1.2.1         gtable_0.2.0         glue_1.3.0          
##  [40] RANN_2.6             reshape2_1.4.3       Rcpp_1.0.0          
##  [43] Biobase_2.40.0       trimcluster_0.1-2.1  gdata_2.18.0        
##  [46] ape_5.1              nlme_3.1-137         iterators_1.0.10    
##  [49] fpc_2.1-11           lmtest_0.9-36        stringr_1.3.1       
##  [52] irlba_2.3.2          gtools_3.8.1         XML_3.98-1.12       
##  [55] DEoptimR_1.0-8       MASS_7.3-50          zoo_1.8-3           
##  [58] scales_0.5.0         hms_0.4.2            doSNOW_1.0.16       
##  [61] parallel_3.5.1       RColorBrewer_1.1-2   yaml_2.1.19         
##  [64] memoise_1.1.0        reticulate_1.7       pbapply_1.3-4       
##  [67] rpart_4.1-13         segmented_0.5-3.0    latticeExtra_0.6-28 
##  [70] stringi_1.2.4        RSQLite_2.1.1        S4Vectors_0.18.3    
##  [73] foreach_1.4.4        checkmate_1.8.5      BiocGenerics_0.26.0 
##  [76] caTools_1.17.1       SDMTools_1.1-221     rlang_0.2.1         
##  [79] pkgconfig_2.0.2      dtw_1.20-1           prabclus_2.2-6      
##  [82] bitops_1.0-6         evaluate_0.11        lattice_0.20-35     
##  [85] ROCR_1.0-7           purrr_0.2.5          bindr_0.1.1         
##  [88] htmlwidgets_1.2      bit_1.1-14           tidyselect_0.2.4    
##  [91] plyr_1.8.4           magrittr_1.5         R6_2.3.0            
##  [94] IRanges_2.14.10      snow_0.4-3           gplots_3.0.1        
##  [97] Hmisc_4.1-1          DBI_1.0.0            pillar_1.3.0        
## [100] foreign_0.8-70       withr_2.1.2          fitdistrplus_1.0-9  
## [103] mixtools_1.1.0       survival_2.42-6      RCurl_1.95-4.11     
## [106] nnet_7.3-12          tibble_1.4.2         tsne_0.1-3          
## [109] crayon_1.3.4         hdf5r_1.0.0          KernSmooth_2.23-15  
## [112] rmarkdown_1.10       progress_1.2.0       grid_3.5.1          
## [115] data.table_1.11.8    blob_1.1.1           metap_0.9           
## [118] digest_0.6.17        diptest_0.75-7       tidyr_0.8.1         
## [121] R.utils_2.7.0        stats4_3.5.1         munsell_0.5.0
## [1] "Seurat  2.3.4"

Clean Metadata

Filter & Normalize Data

Gene Biotypes

Include only subsets of genes by type. Biotypes from: https://useast.ensembl.org/info/genome/genebuild/biotypes.html

## Seurat::FindGeneTerms() # Enrichr API
## Seurat::MultiModal_CCA() # Integrates data from disparate datasets (CIA version too)
if(!is.null(subsetGenes)){
  # If the gene_biotypes file exists, import csv. Otherwise, get from biomaRt
  if(file_test("-f", "Data/gene_biotypes.csv")){
    biotypes <- read.csv("Data/gene_biotypes.csv") 
  }
  else {
    ensembl <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org",
                     dataset="hsapiens_gene_ensembl") 
    ensembl <- useDataset(mart = ensembl, dataset = "hsapiens_gene_ensembl")
    listFilters(ensembl)
    listAttributes(ensembl)   
    biotypes <- getBM(attributes=c("hgnc_symbol", "gene_biotype"), filters="hgnc_symbol",
          values=row.names(pbmc@data), mart=ensembl) 
    write.csv(biotypes, "Data/gene_biotypes.csv", quote=F, row.names=F)
  } 
  # Subset data by creating new Seurat object (annoying but necessary)
  geneSubset <- biotypes[biotypes$gene_biotype==subsetGenes,"hgnc_symbol"] 
  
  print(paste(dim(pbmc@raw.data[geneSubset, ])[1],"/", dim(pbmc@raw.data)[1], 
              "genes are", subsetGenes))
  # Add back into pbmc 
  subset.matrix <- pbmc@raw.data[geneSubset, ] # Pull the raw expression matrix from the original Seurat object containing only the genes of interest
  pbmc_sub <- CreateSeuratObject(subset.matrix) # Create a new Seurat object with just the genes of interest
  orig.ident <- row.names(pbmc@meta.data) # Pull the identities from the original Seurat object as a data.frame
  pbmc_sub <- AddMetaData(object = pbmc_sub, metadata = pbmc@meta.data) # Add the idents to the meta.data slot
  pbmc_sub <- SetAllIdent(object = pbmc_sub, id = "ident") # Assign identities for the new Seurat object
  pbmc <- pbmc_sub
  rm(pbmc_sub)
  pbmc
} 
## [1] "14827 / 24914 genes are protein_coding"
## An object of class seurat in project SeuratProject 
##  14827 genes across 27863 samples.

Diagnostic Plots

Gene Plots

percent.mito plot

nGene plot

Dimensionality Reduction

PCA

ProjectPCA scores each gene in the dataset (including genes not included in the PCA) based on their correlation with the calculated components. Though we don’t use this further here, it can be used to identify markers that are strongly correlated with cellular heterogeneity, but may not have passed through variable gene selection. The results of the projected PCA can be explored by setting use.full=T in the functions above

  • Other Dim Reduction Methods in Seurat
    • RunCCA()
    • RunMultiCCA()
    • RunDiffusion()
    • RunPHATE()
    • RunUMAP()
    • RunICA()

Significant PCs

Determine statistically significant PCs for further analysis. NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in PCElbowPlot() can be used to reduce computation time

Find Cell Clusters

We first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To cluster the cells, we apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [SLM, Blondel et al., Journal of Statistical Mechanics], to iteratively group cells together, with the goal of optimizing the standard modularity function.

## Parameters used in latest FindClusters calculation run on: 2019-01-03 17:34:49
## =============================================================================
## Resolution: 2
## -----------------------------------------------------------------------------
## Modularity Function    Algorithm         n.start         n.iter
##      1                   1                 100             10
## -----------------------------------------------------------------------------
## Reduction used          k.param          prune.SNN
##      pca                 30                0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10

Cluster Biomarkers

NA

## Error in kable_styling(., bootstrap_options = kableStyle): could not find function "kable_styling"

Top Biomarker Plots

Ridgeplot

## Picking joint bandwidth of 0.0784
## Picking joint bandwidth of 0.0375
## Picking joint bandwidth of 0.339
## Picking joint bandwidth of 0.289
## Picking joint bandwidth of 0.427
## Picking joint bandwidth of 0.112
## Picking joint bandwidth of 0.355
## Picking joint bandwidth of 0.129
## Picking joint bandwidth of 0.0345
## Picking joint bandwidth of 0.033
## Picking joint bandwidth of 0.0581

Cell Sub-clusters

Further subdivisions within cell types.
If you perturb some of our parameter choices above (for example, setting resolution=0.8 or changing the number of PCs), you might see the CD4 T cells subdivide into two groups. You can explore this subdivision to find markers separating the two T cell subsets. However, before reclustering (which will overwrite object@ident), we can stash our renamed identities to be easily recovered later.